
**********
* Readme *
**********

* This script models expected wages with two non-parametric corrections:
* [1] Heckman's (1979) correction for selection (error in wage equation possibly correlated with error in prob of being observed as wage worker)
* [2] Duan's (1983) correction for retransformatio bias (required because exp(E[xb]) != E(y) after reg ln_y x)


* Root folder (PATH TO BE DEFINED BY THE USER)
**********************************************
clear all
global analysis "C:/***/replication_package"


* Timestamped log
*****************
global today = strofreal(date(c(current_date), "DMY"), "%tdYYNNDD")
log using "${analysis}/code/logs/4_1_estimate_potential_wages_${today}.smcl", replace


**************
* Import POF *
**************

use "${analysis}/data/2_2_pof_clean.dta", clear
svyset psu_id [pweight = pweight], strata(strata_id) singleunit(centered) vce(linearized) 


* Covariates
************
* Key idea: what's the expected average wage for people similar to me?

* Covariates for the wage model
unab wage_covariates_all : rg_* ae_* region_* // all
unab wage_ref_groups : rg_1_nonwhite_female ae_1_1 region_SP_c // reference group: non white female, no education, Sao Paulo
global wage_covariates : list wage_covariates_all - wage_ref_groups // final list of regressors

* The selection issue: wages are not observed for a random sample of the population, but for those who have chosen to be employees.
* Unbiased conditional counterfactual wages require a correction that takes into account the probability of being observed as employee.

* Define extended set of covariates for selection correction
unab covariates_all : rg_* ae_* at_* fp_* n_kids n_youngs n_adults n_seniors region_* // all
unab ref_groups : rg_1_nonwhite_female ae_1_1 fp_h_wp_nk region_SP_c // reference group: non white female, no education, head w partner no kids, Sao Paulo
global covariates : list covariates_all - ref_groups // final list of regressors


************************************************
* Baseline estimation: Heckman selection model *
************************************************

* Preferred definition (domestic workers as OAWs)
tab work_state work_state_ibge
gen employee = (work_state_name == "Employee")

replace ln_winc_employee = .
replace ln_winc_employee = ln_winc if work_state_name == "Employee"

* No correction for selection
svy: reg ln_winc_employee $covariates
estimates store ols

* Heckman selection model without exclusion restrictions
svy: heckman ln_winc_employee $wage_covariates, select(employee = $wage_covariates)
estimates store NoRestrictions

* Heckman selection model with exclusion restrictions
svy: heckman ln_winc_employee $wage_covariates, select(employee = $covariates)
estimates store YesRestrictions
estimates save "${analysis}/results/estimations/4_1_mod_potential_wage.ster", replace

* Generalized Hausman test: do the common coefficients of the earnings equation change when a selection correctionn is added?
suest YesRestrictions ols
test [YesRestrictions_ln_winc_employee = ols], common

* P-value ~ 0, reject the null that coeffs are the same. 

* Generalized Hausman test: do the common coefficients of the earnings equation change when the exclusion restriction is added?
suest YesRestrictions NoRestrictions
test [YesRestrictions_ln_winc_employee = NoRestrictions_ln_winc_employee], common

* P-value ~ 0, reject the null that coeffs are the same. 


***************************
* Fitted values: baseline *
***************************

* Correction for log specification: since we are interested in the level of wages, it's necessary to retransform it.
* As exp(xb) is known to be biased, we apply the smearing correction from Duan (1983), which is flexible, easy to implement and robust to departures from normal errors.

estimates use "${analysis}/results/estimations/4_1_mod_potential_wage.ster"
predictnl exp_ei = exp(ln_winc_employee - xb()) // residuals
mean exp_ei [pweight = pweight]
matrix temp = e(b)
scalar mean_exp_e = temp[1,1] // correction term

estimates use "${analysis}/results/estimations/4_1_mod_potential_wage.ster"
predict est_ln_wage_0, xb // fitted log wage
predictnl est_wage_0 = exp(xb()) * scalar(mean_exp_e) // unbiased fitted wage level
drop exp_ei

* Summary of baseline results
svy, subpop(if work_state_name == "Employee"):           mean winc est_wage_0
svy, subpop(if work_state_name == "Own-account worker"): mean winc est_wage_0


***************************************************************
* Baseline estimation: Heckman selection model FOR OAW INCOME *
***************************************************************

* Preferred definition (domestic workers as OAWs)
gen oaw = (work_state_name == "Own-account worker")
replace ln_winc_oaw = .
replace ln_winc_oaw = ln_winc if work_state_name == "Own-account worker"

* Heckman selection model with exclusion restrictions
svy: heckman ln_winc_oaw $wage_covariates, select(oaw = $covariates)
estimates save "${analysis}/results/estimations/4_1_mod_potential_oaw_income.ster", replace


* Fitted values
***************

estimates use "${analysis}/results/estimations/4_1_mod_potential_oaw_income.ster"
predictnl exp_winc_oaw_ei = exp(ln_winc_oaw - xb()) // residuals
mean exp_winc_oaw_ei [pweight = pweight]
matrix temp = e(b)
scalar mean_exp_winc_oaw_e = temp[1,1] // correction term

estimates use "${analysis}/results/estimations/4_1_mod_potential_oaw_income.ster"
predict est_ln_oaw_inc_0, xb // fitted log income
predictnl est_oaw_inc_0 = exp(xb()) * scalar(mean_exp_winc_oaw_e) // unbiased fitted income level
drop exp_winc_oaw_ei

* Summary of baseline results
svy, subpop(if work_state_name == "Employee"):           mean winc est_oaw_inc_0
svy, subpop(if work_state_name == "Own-account worker"): mean winc est_oaw_inc_0


***************************************************
* Residuals from WAGES and OAW INCOME estimations *
***************************************************

gen res_winc_employee = winc - est_wage_0    if work_state_name == "Employee"
gen res_winc_oaw      = winc - est_oaw_inc_0 if work_state_name == "Own-account worker"

qui svy, subpop(if work_state_name == "Employee"): mean winc est_wage_0 res_winc_employee
estat sd

qui svy, subpop(if work_state_name == "Own-account worker"): mean winc est_oaw_inc_0 res_winc_oaw
estat sd


****************************************
* Plot the conditional work income gap *
****************************************

* Observed income as oaw minus counterfactual income as wage worker
keep if !missing(winc) & work_state_name == "Own-account worker"
gen cond_inc_gap = winc - est_wage_0
svy: mean cond_inc_gap

gen fweight = round(pweight)
sum cond_inc_gap [fweight = fweight], detail

* Which share at the left side of 0?
numlist "80(0.1)84"
global p_list = "`r(numlist)'"
epctile cond_inc_gap, svy percentiles($p_list) 


* FIGURE 2
**********

graph twoway ///
  (pci 0 0 9.75 0, lc(gs4) lp(shortdash)) ///
  (histogram cond_inc_gap if cond_inc_gap > -2500 & cond_inc_gap < 2000, fcolor(gs10) bin(36) percent), ///
		text(9.50 -80 "{bf: OAW income < potential wage}", place(w) justification(right) box bcolor(white)) ///
		text(8.85 -80 "{it: 81.5% of cases}", place(w) justification(right) box bcolor(white)) ///
		text(9.50  40 "{bf: OAW income > potential wage}", place(e) justification(left) box bcolor(white)) ///
		text(8.85  40 "{it: 18.5% of cases}", place(e) justification(left) box bcolor(white)) ///
		ylabel(0 "0%" 1 "1%" 2 "2%" 3 "3%" 4 "4%" 5 "5%" 6 "6%" 7 "7%" 8 "8%", grid) ///
		xlabel(none -2500 "-2.5k" -2000 "-2k" -1500 "-1.5k" -1000 "-1k" -500 "-500" 0 "0" 500 "500" 1000 "1k" 1500 "1.5k" 2000 "2k") ///
        ytitle("") ///
		xtitle("{bf:Current own-account work income minus potential wage (in R{c $|})}") legend(off) ysize(3) xsize(4.5)

graph export "${analysis}/results/figures/fig2.eps", replace


* Store fitted values for OAW
*****************************
keep  ind_id est_ln_wage_0 est_wage_0
order ind_id est_ln_wage_0 est_wage_0
compress

label var est_ln_wage_0 "Estimated log wage (baseline)" 
label var est_wage_0 "Estimated wage (baseline)"

describe
save "${analysis}/data/4_1_est_potential_wages.dta", replace


****************************
* Estimation output tables *
****************************

use "${analysis}/data/2_2_pof_clean.dta", clear
svyset psu_id [pweight = pweight], strata(strata_id) singleunit(centered) vce(linearized) 
keep if ( !missing(ln_winc_employee) | !missing(ln_winc_oaw) )

estimates use "${analysis}/results/estimations/4_1_mod_potential_wage.ster"
estimates store est_wage_main

nlcom ("rho": tanh(_b[/athrho])) ("sigma": exp(_b[/lnsigma])), post
matrix define b = e(b)
matrix define V = e(V)

estimates restore est_wage_main
estadd scalar b_heck_rho    b[1, colnumb(b, "rho")]
estadd scalar b_heck_sigma  b[1, colnumb(b, "sigma")]
estadd scalar sd_heck_rho   sqrt(V[colnumb(V, "rho"), colnumb(V, "rho")])
estadd scalar sd_heck_sigma sqrt(V[colnumb(V, "sigma"), colnumb(V, "sigma")])

estimates use "${analysis}/results/estimations/4_1_mod_potential_wage.ster"
estimates store est_wage_select

global region region_*

* Print
esttab est_wage_main est_wage_select, varwidth(50) ///
  eq(1:2) keep(#1:) cells("b(fmt(3)) se(par fmt(3))") ///
  label mlabels(, titles) nonumbers eqlabels(none) collabel("coef." "s.e.") ///
  drop($region _cons) ///
  refcat( ///
    rg_2_white_female "Ethnicity and gender (ref: Nonwhite female)" ///
    ae_1_2 "Age and education (ref: 14-24, less than prim. school)" ///
    ae_2_1 "[BLANK LINE]" ///
    ae_3_1 "[BLANK LINE]" ///
    ae_4_1 "[BLANK LINE]" ///
    ae_5_1 "[BLANK LINE]" ///
    at_school "Current schooling status (ref: Not currently studying)" ///
    fp_h_wp_wk "Household position (ref: Head, with partner, no kids)" ///
    n_kids "Number of household members by age", nolabel) ///
  stats(b_heck_rho sd_heck_rho b_heck_sigma sd_heck_sigma, ///
	   layout("@ (@)" "@ (@)") ///
		  labels("Errors correlation" "Standard deviation of errors") ///
		  fmt(3 3 3 3))

          
* TABLE 5
*********

esttab est_wage_main est_wage_select using "${analysis}/results/tables/table5.tex", style(tex) fragment replace ///
  eq(1:2) keep(#1:) cells("b(fmt(3)) se(par fmt(3))") ///
  label mlabels(none) nonumbers eqlabels(none) collabel(none) ///
  drop($region _cons) ///
  refcat( ///
    rg_2_white_female   "[nospace]\textit{Ethnicity and gender} [emptycolumns] \\ \hspace{2ex}Nonwhite female" ///
    ae_1_2              "[nospace]\textit{Age and education} [emptycolumns] \\ \hspace{2ex}14-24, less than prim. school" ///
    at_school           "[nospace]\textit{Current schooling status} [emptycolumns] \\ \hspace{2ex}Not currently studying" ///
    fp_h_wp_wk          "[nospace]\textit{Household position} [emptycolumns] \\ \hspace{2ex}Head, with partner, no kids" ///
    n_kids              "[nospace]\textit{Number of household members by age}", nolabel) ///
  varlabels(, /// 
    prefix(\hspace{2ex}) ///
    elist(rg_4_white_male \tabularnewline ae_1_4 \tabularnewline ae_2_4 \tabularnewline ae_3_4 \tabularnewline ae_4_4 \tabularnewline ae_5_4 \tabularnewline at_college \tabularnewline fp_oa \tabularnewline n_seniors \tabularnewline\midrule)) ///
  stats(b_heck_rho sd_heck_rho b_heck_sigma sd_heck_sigma, ///
	   layout("@ (@)" "@ (@)") ///
		  labels("\textit{Heckman selection model ancillary parameters} [emptycolumns] \\ \hspace{2ex}Errors correlation" "\hspace{2ex}Standard deviation of errors") ///
		  fmt(3 3 3 3)) ///
  substitute("\hline" "" ///
             "&" " & " "  " " " "  " " " "  " " " "  " " " ///
             " & & " " & $\cdot$ & " ///
             " & & " " & $\cdot$ & " ///
             " & & " " & $\cdot$ & " ///
             " & \\" " & $\cdot$ \\" ///
             "\hspace{2ex}[nospace]" "" ///
             "[emptycolumns]" "& & & &" ///
             "\textit{Number of household members by age} & $\cdot$ & $\cdot$ & $\cdot$ & $\cdot$ \\" "\textit{Number of household members by age} & & & & \\")

             
*******************************************
* APPENDIX: Domestic workers as employees *
*******************************************

use "${analysis}/data/2_2_pof_clean.dta", clear
svyset psu_id [pweight = pweight], strata(strata_id) singleunit(centered) vce(linearized) 

* IBGE default definition (domestic workers as employees)
gen employee = (work_state_ibge_name == "Employee")

replace ln_winc_employee = .
replace ln_winc_employee = ln_winc if work_state_ibge_name == "Employee"

svy: heckman ln_winc_employee $wage_covariates, select(employee = $covariates) 
estimates save "${analysis}/results/estimations/4_1_appx_dom_mod_potential_wage.ster", replace


*****************
* Fitted values *
*****************

* Correction for log specification: since we are interested in the level of wages, it's necessary to retransform it.
* As exp(xb) is known to be biased, we apply the smearing correction from Duan (1983), which is flexible, easy to implement and robust to departures from normal errors.

estimates use "${analysis}/results/estimations/4_1_appx_dom_mod_potential_wage.ster"
predictnl exp_ei = exp(ln_winc_employee - xb()) // residuals
mean exp_ei [pweight = pweight]
matrix temp = e(b)
scalar mean_exp_e = temp[1,1] // correction term

estimates use "${analysis}/results/estimations/4_1_appx_dom_mod_potential_wage.ster"
predict   appx_dom_est_ln_wage_0, xb // fitted log wage
predictnl appx_dom_est_wage_0 = exp(xb()) * scalar(mean_exp_e) // unbiased fitted wage level
drop exp_ei

* Summary
svy, subpop(if work_state_ibge_name == "Employee"):           mean winc appx_dom_est_wage_0
svy, subpop(if work_state_ibge_name == "Own-account worker"): mean winc appx_dom_est_wage_0


* Plot the conditional work income gap
**************************************

* Observed income as oaw minus predicted income as wage worker
keep if !missing(winc) & work_state_ibge_name == "Own-account worker"
gen cond_inc_gap = winc - appx_dom_est_wage_0
svy: mean cond_inc_gap

gen fweight = round(pweight)
sum cond_inc_gap [fweight = fweight], detail

* Which share at the left side of 0?
numlist "73(.1)76"
global p_list = "`r(numlist)'"
epctile cond_inc_gap, svy percentiles($p_list) 


* FIGURE 6
**********

graph twoway ///
  (pci 0 0 9.75 0, lc(gs4) lp(shortdash)) ///
  (histogram cond_inc_gap if cond_inc_gap > -2500 & cond_inc_gap < 2000, fcolor(gs10) bin(36) percent), ///
		text(9.50 -80 "{bf: OAW income < potential wage}", place(w) justification(right) box bcolor(white)) ///
		text(8.85 -80 "{it: 74.5% of cases}", place(w) justification(right) box bcolor(white)) ///
		text(9.50  40 "{bf: OAW income > potential wage}", place(e) justification(left) box bcolor(white)) ///
		text(8.85  40 "{it: 25.5% of cases}", place(e) justification(left) box bcolor(white)) ///
		ylabel(0 "0%" 1 "1%" 2 "2%" 3 "3%" 4 "4%" 5 "5%" 6 "6%" 7 "7%" 8 "8%", grid) ///
		xlabel(none -2500 "-2.5k" -2000 "-2k" -1500 "-1.5k" -1000 "-1k" -500 "-500" 0 "0" 500 "500" 1000 "1k" 1500 "1.5k" 2000 "2k") ///
        ytitle("") ///
		xtitle("{bf:Current own-account work income minus potential wage (in R{c $|})}") legend(off) ysize(3) xsize(4.5)

graph export "${analysis}/results/figures/fig6.eps", replace


* Store the fitted values for OAWs
**********************************

keep  ind_id appx_dom_est_ln_wage_0 appx_dom_est_wage_0 
order ind_id appx_dom_est_ln_wage_0 appx_dom_est_wage_0
compress
label var appx_dom_est_ln_wage_0 "Estimated log wage (dom worker as employee)" 
label var appx_dom_est_wage_0 "Estimated wage (dom worker as employee)"
describe

merge 1:1 ind_id using "${analysis}/data/4_1_est_potential_wages.dta", nogenerate
describe
save "${analysis}/data/4_1_est_potential_wages.dta", replace


* End of script
***************
cap log close